#!/usr/bin/env python
# -*- mode: python; coding: utf-8 -*-

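# FIXME: sentences are handled incorrectly when they span several lines
# (only the line holding the terminator is kept) or when they mix
# terminators (e.g. a "?" with a "." after it: the text is cut at the "?").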

# Sentence terminators searched for by extract().  Written as unicode
# literals (valid on Python 2 and on Python 3.3+) instead of calling the
# Python 2-only unicode() builtin; the values are unchanged.
dot = u"。"  # U+3002 IDEOGRAPHIC FULL STOP
# NOTE(review): unlike the other two marks this is the ASCII "!" rather
# than the full-width U+FF01 -- confirm that this is intended.
exc = u"!"
ask = u"？"  # U+FF1F FULLWIDTH QUESTION MARK

def extract(filename):
    """Print the sentence/transcription pairs found in a UTF-8 text file.

    The file is scanned line by line:

    * A line containing a sentence terminator starts a record.  The
      terminators are tried in the order ``ask``, ``exc``, ``dot``; the
      text up to and including the LAST occurrence of the first terminator
      found is kept (whitespace-stripped).  Lines without a terminator are
      ignored.
    * The line right after it is skipped.
    * The line after that is taken whole (whitespace-stripped) as the
      second element of the pair.

    Each distinct pair is printed once, in order of first appearance, as an
    ``HZ:`` line, a ``PY:`` line and a 50-dash separator (presumably the
    hanzi text and its pinyin transcription -- the code itself does not
    check what the lines contain).  A record still incomplete at end of
    file is dropped.

    Args:
        filename: path of the UTF-8 encoded text file to read.

    Returns:
        None; the result is written to standard output.

    Raises:
        IOError / OSError: if the file cannot be opened.
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    ordered_pairs = []   # pairs in order of first appearance
    seen = set()         # same pairs, for O(1) duplicate detection
    sentence = None
    # 0: looking for a sentence line, 1: skipping the line that follows it,
    # 2: the current line is the second element of the pair.
    state = 0

    # Binary mode + explicit decode yields unicode text on both Python 2
    # and Python 3 and still splits lines on "\n" only; the with-block
    # guarantees the file is closed.
    with open(filename, "rb") as source:
        for raw in source:
            line = raw.decode("utf-8")
            if state == 0:
                for mark in (ask, exc, dot):
                    if mark in line:
                        sentence = line[:line.rindex(mark) + 1].strip()
                        state = 1
                        break
            elif state == 1:
                state = 2
            else:
                # strip() already removes the line terminator; slicing off
                # the last character first would eat a real character when
                # the final line of the file has no trailing newline.
                pair = (sentence, line.strip())
                if pair not in seen:
                    seen.add(pair)
                    ordered_pairs.append(pair)
                state = 0

    # A parenthesised single argument prints identically whether print is
    # the Python 2 statement or the Python 3 function.
    for hanzi, pinyin in ordered_pairs:
        print(u"HZ: %s" % hanzi)
        print(u"PY: %s" % pinyin)
        print("-" * 50)

# Script entry point: expects exactly one argument, the text file to scan.
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        # A parenthesised single argument prints the same text whether
        # print is the Python 2 statement or the Python 3 function.
        print("USAGE: %s <text_file>" % sys.argv[0])
        sys.exit(-1)

    extract(sys.argv[1])
